#!/usr/bin/perl
use strict;
use warnings;
use Bio::AlignIO;
use Bio::Tools::GFF;
use Bio::SeqFeatureI;
use Cwd 'abs_path';
use Getopt::Long;

##############################################
#### Utility Functions #######################

sub revCigar {
	my ($cigar) = @_;
	
	$cigar=~ s/M/M /g;
	$cigar=~ s/I/I /g;
	$cigar=~ s/D/D /g;
	$cigar=~ s/H/H /g;
	$cigar=~ s/S/S /g;
	my @cigarsp = split /\s+/, $cigar;
	@cigarsp= reverse @cigarsp;
	my $cigar_rev = join("", @cigarsp);
	return ($cigar_rev);
}
##############################################

my $conservatoryDir=abs_path(".");
my $genome="";
my $format="";
my $relPosition="";
my $infileName="";
my $outfileName="";
my $tag="";
my $help=0;
my $infile;
########### Parameters
my $upstreamLength;
my $downstreamLength;

GetOptions ("conservatoryDirectory=s" => \$conservatoryDir,
			"genome=s" => \$genome,
			"format=s" => \$format,
			"relPos=s" => \$relPosition,
			"input=s" => \$infileName,
			"output=s" => \$outfileName,
			"tag=s" => \$tag,
			"help" => \$help) or die ("Error in command line arguments\n");
			

if($genome eq "" || $format eq "" || $relPosition eq "" || $help || ($format ne "SAM" && $format ne "MAF") || ($relPosition ne "UP" && $relPosition ne "DOWN")) {
	print "Conservatory version 0.0.1\n\n";
	
	print "convertRelative2absCoordinate --genome <genomeName> --format <SAM|MAF> --relPos <UP|DOWN>\n\n\n";
	print "\t--conservatoryDirectory\t\tPath of the main conservatory directory.\n";
	print "\t--genome\t\tName of the reference genome (REQUIRED).\n";
	print "\t--format\t\tInput format (SAM or MAF) (REQUIRED).\n";
	print "\t\t\t\t(Currently only SAM is implemented)\n";
	print "\t--relPos\t\tCoordinates are relative to UPstream or DOWNstream (REQUIRED).\n";
	print "\t--tag\t\tExtra info to add about gene.\n";
	print "\t--input\t\t\tInput file name (default:STDIN).\n";
	print "\t--output\t\tOutput file name (default:STDOUT).\n";
	exit();
}
### Set up directory and file access
my $genomedbFile = $conservatoryDir . "/genome_database.csv";
my $parameterFile = $conservatoryDir . "/conservatory.parameters.txt";
my $genomeDir ="$conservatoryDir/genomes";
my $footprintFile = "";

####### First, sanity checks. Check to see if directory structure is OK and if programs are installed
####### 
die "ERROR: Cannot find file genome database file ($genomedbFile)\n" unless -e $genomedbFile;
die "ERROR: Conservatory directory structure at ($conservatoryDir) is corrupt\n" unless (-e $genomeDir);
die "ERROR: Cannot find input file ($infileName)\n" unless (-e $infileName || $infileName eq "");

################### Read Genome DB
open(my $genomedb, "<", $genomedbFile);
while(my $curgenomeline= <$genomedb>) {
	chomp $curgenomeline;
	my ($curgenomeName, $curgenomeSpecies,  $curgenomeFamily, $curgenomeReference, $curUpstreamLength, $curDownstreamLength, $geneNameField, $geneProcessingRegEx, $gene2SpeciesIdentifier, $proteinProcessingRegEx) = split /,/, $curgenomeline;

	if($curgenomeName eq $genome) {
		$footprintFile = "$genomeDir/$curgenomeFamily/$genome.footprint.gff3";
		$upstreamLength = $curUpstreamLength * 1000;
		$downstreamLength = $curDownstreamLength * 1000;
	}
}
close($genomedb);
die "ERROR: Cannot find genome $genome in genome database file.\n" unless ($footprintFile ne "");
die "ERROR: Cannot find footprint file ($footprintFile)\n" unless (-e $footprintFile);

if($infileName eq "") {
	$infile = \*STDIN;
} else {
	open($infile, "<", $infileName);
}

my $gff = Bio::Tools::GFF->new(-file => $footprintFile,
							   -gff_version => 3);

my $length;
if($relPosition eq "UP") {
	$length = $upstreamLength;
} else {
	$length = $downstreamLength;
}

my $outbedorthfilename = $ARGV[4];
my $outconservbedfilename = $ARGV[5];
											
my %genePos = (
	'Chr' => "",
	'Start' => 0,
	'End' => 0,
	'Strand' => "",
	'Name' => "");

if($format eq "SAM") {
	while ( my $line =<$infile>) {
		if(substr($line,0,1) eq '@' ) {
			#If this is the gene coordinate. find it.
			if(substr($line,0,3) eq '@SQ') {
				my @fields = split /\t/, $line;
				
				my $genename = $fields[1];
				$genename =~ s/SN://;

				while((my $feature = $gff->next_feature) && $genePos{'Chr'} eq "") {
					my @tags = $feature->get_tag_values("Name");
					if( $tags[0] eq $genename) {
						$genePos{'Chr'} = $feature->seq_id;
						$genePos{'Start'} =$feature->start;
						$genePos{'End'} =$feature->end;
						if ($feature->strand == -1) {
							$genePos{'Strand'} = "-";
						} else {
							$genePos{'Strand'} = "+";
						}
						$genePos{'Name'} = $genename;
					}
				}
				printf("\@SQ\tSN:" . $genePos{'Chr'} . "\tLN:9999999999\n");
				
			} else {
				print $line;
			}
		} else {
			chomp $line;
			my @fields= split /\t/,$line;
	
			my $orthname;
			if($tag eq "") {
				$orthname = $genePos{'Name'} . ":" . $fields[0];
			} else {
				$orthname = $genePos{'Name'} . ":$tag:" . $fields[0];
			}
			$fields[0] = $orthname;
			$fields[2] = $genePos{'Chr'};
			my $aln_start = $fields[3];
			
			my $seq = $fields[9];
			
			## Remove all insertions in order to translate coordinates to reference
			my $curcigar = $fields[5];
			my $newcigar ="";
			my $curPosInReference=1;
			my $curPosInTarget=0;
			my $inserlen=0;
			
			my @cigar_parts = $curcigar =~ /(\d+[MIDNSHP=X])/g;
			for my $cigar_item (@cigar_parts) {
                if( $cigar_item =~ /(\d+)([MIDNSHP=X])/) {
                        my $number = $1;
                        my $action = $2;
                         
                        if($action eq 'M') {
                        	$curPosInReference += $number;
                        	$curPosInTarget += $number;
                        	$newcigar .= $number . $action;
                        } elsif($action eq 'D' || $action eq 'N') {
                        	$curPosInReference += $number;
                        	$newcigar .= $number . $action;
                        	
                        } elsif($action eq 'I' ) {
                            # filter out insertion
                            $inserlen += $number;
                            
                            substr($seq,$curPosInTarget, $number) = 'Z' x $number;
                            $curPosInTarget += $number;
                            
                        }
                }
            }
            
            $seq =~ s/Z//g;          
            my $aln_end = $curPosInReference;

			$fields[5] = $newcigar;
			$fields[9] = $seq;
		
			if($genePos{'Strand'} eq "+") {
				if($relPosition eq "UP") {
					$fields[3] =  $genePos{'Start'}  - $length + $aln_start -1 ;
				} else { #if its downstream
					$fields[3] = $genePos{'End'} + $aln_start + 1;	
				}
			} else {
				$fields[9] = reverse $fields[9];
				$fields[9] =~ tr/ATGCatgc/TACGtacg/;
				# reverse CIGAR...
				$fields[5] = revCigar($fields[5]);

				if($relPosition eq "UP") {
					$fields[3] =  $genePos{'End'} + $length - ($aln_start + $aln_end) +3;
					#adding 3, because the lastz sam format is starting with 1, the aln_end is +1.
				} else {
					$fields[3] = $genePos{'Start'} - ($aln_start + $aln_end) +2;	
				}
			}

			printf ( join("\t", @fields) ."\n");
			#			print "genePos:\t$genePos{'Strand'}\trelPosition:\t$relPosition\tlength:\t$length\taln_start:\t$aln_start\taln_end:\t$aln_end\n";

		}
	}
}
